Module 08
# mpg using horsepower
auto_mpg_column_names <- c("mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year", "origin", "car_name")
# The raw file is space-delimited for the 8 numeric columns, but the car
# name is separated by a tab — so read the numeric columns and the name
# column in two passes and bind them back together.
auto_mpg_spaces <- read_table("datasets/auto+mpg/auto-mpg.data", na = "?", col_names = auto_mpg_column_names[1:8])
auto_mpg_tab <- read_delim("datasets/auto+mpg/auto-mpg.data", delim = "\t", col_names = FALSE) |> select(2)
# auto_mpg_spaces
# auto_mpg_tab
auto_mpg_9 <- bind_cols(auto_mpg_spaces, auto_mpg_tab) |>
  set_names(auto_mpg_column_names)
summary(auto_mpg_9)
      mpg          cylinders      displacement     horsepower        weight
Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
1st Qu.:17.50 1st Qu.:4.000 1st Qu.:104.2 1st Qu.: 75.0 1st Qu.:2224
Median :23.00 Median :4.000 Median :148.5 Median : 93.5 Median :2804
Mean :23.51 Mean :5.455 Mean :193.4 Mean :104.5 Mean :2970
3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:262.0 3rd Qu.:126.0 3rd Qu.:3608
Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
NA's :6
acceleration model_year origin car_name
Min. : 8.00 Min. :70.00 Min. :1.000 Length:398
1st Qu.:13.82 1st Qu.:73.00 1st Qu.:1.000 Class :character
Median :15.50 Median :76.00 Median :1.000 Mode :character
Mean :15.57 Mean :76.01 Mean :1.573
3rd Qu.:17.18 3rd Qu.:79.00 3rd Qu.:2.000
Max. :24.80 Max. :82.00 Max. :3.000
mpg cylinders displacement horsepower weight
Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
acceleration model_year origin car_name
Min. : 8.00 Min. :70.00 Min. :1.000 Length:392
1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 Class :character
Median :15.50 Median :76.00 Median :1.000 Mode :character
Mean :15.54 Mean :75.98 Mean :1.577
3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
Max. :24.80 Max. :82.00 Max. :3.000
# Scatterplot of mpg vs horsepower for the cleaned Auto MPG data.
# NOTE: the original chained two labs() calls; the second fully
# overrode the first, so only one is kept.
auto_mpg_9_final |>
  ggplot(aes(x = horsepower, y = mpg)) +
  geom_point() +
  labs(
    title = "Exploratory Data Analysis (EDA) of Auto MPG dataset",
    subtitle = "Understanding the relationship between Horsepower and MPG",
    x = "Horsepower",
    y = "MPG",
    caption = "Source: Auto MPG dataset from UCI Machine Learning Repository"
  ) +
  theme_light()
Call:
lm(formula = mpg ~ horsepower, data = auto_mpg_train_data)
Residuals:
Min 1Q Median 3Q Max
-13.5314 -3.4708 -0.0281 2.5311 12.2426
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 40.144797 0.949810 42.27 <2e-16 ***
horsepower -0.161297 0.008591 -18.77 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.602 on 196 degrees of freedom
Multiple R-squared: 0.6427, Adjusted R-squared: 0.6408
F-statistic: 352.5 on 1 and 196 DF, p-value: < 2.2e-16
1 2 3 4 5 6 7 8
13.530841 3.853039 12.724358 15.950292 3.853039 24.821610 24.821610 24.499017
9 10 11 12 13 14 15 16
25.950687 32.725149 26.111984 21.918270 25.628094 5.466006 7.885456 9.014533
17 18 19 20 21 22 23 24
25.950687 25.628094 24.015127 23.208643 25.950687 13.530841 11.111390 22.402160
25 26 27 28 29 30 31 32
28.531435 24.015127 26.273281 29.015325 30.466995 27.241061 31.434775 26.273281
33 34 35 36 37 38 39 40
15.950292 15.466402 6.595083 15.143808 14.337325 15.950292 26.111984 29.015325
41 42 43 44 45 46 47 48
26.273281 25.305501 24.499017 27.241061 25.950687 11.917874 8.208050 15.950292
49 50 51 52 53 54 55 56
14.659918 5.466006 24.015127 24.015127 25.950687 24.821610 11.111390 24.982907
57 58 59 60 61 62 63 64
25.628094 22.886050 16.756775 3.046555 25.466797 24.821610 24.015127 29.337918
65 66 67 68 69 70 71 72
27.241061 28.047544 22.402160 23.208643 17.563259 15.950292 15.950292 28.047544
73 74 75 76 77 78 79 80
24.821610 28.531435 12.724358 15.950292 16.272885 23.208643 22.402160 24.821610
81 82 83 84 85 86 87 88
22.402160 19.337522 28.047544 26.757171 24.499017 25.628094 24.821610 24.337720
89 90 91 92 93 94 95 96
21.595676 27.079764 25.305501 27.402358 26.757171 17.563259 15.950292 24.015127
97 98 99 100 101 102 103 104
23.208643 27.079764 31.757369 31.596072 24.015127 22.402160 28.047544 15.950292
105 106 107 108 109 110 111 112
16.756775 19.176226 29.176621 16.756775 23.208643 24.337720 11.111390 12.724358
113 114 115 116 117 118 119 120
16.111588 25.789391 26.757171 29.337918 27.563654 24.499017 22.402160 22.402160
121 122 123 124 125 126 127 128
32.402556 29.499215 31.757369 30.466995 22.402160 17.563259 25.628094 26.434577
129 130 131 132 133 134 135 136
22.402160 16.756775 17.724555 17.563259 24.499017 24.821610 26.434577 24.499017
137 138 139 140 141 142 143 144
21.595676 28.692731 26.434577 25.628094 19.176226 17.885852 18.369742 15.143808
145 146 147 148 149 150 151 152
29.660512 27.724951 28.692731 25.628094 28.854028 28.854028 29.660512 25.628094
153 154 155 156 157 158 159 160
21.595676 21.595676 25.628094 27.886248 30.466995 29.660512 25.628094 25.628094
161 162 163 164 165 166 167 168
28.047544 25.305501 29.660512 32.402556 29.337918 29.337918 18.853632 24.015127
169 170 171 172 173 174 175 176
25.950687 26.595874 22.402160 29.337918 30.144402 29.176621 29.983105 29.660512
177 178 179 180 181 182 183 184
29.660512 28.047544 24.015127 27.886248 21.434380 20.789193 26.434577 25.950687
185 186 187 188 189 190 191 192
26.434577 28.208841 29.176621 28.854028 29.337918 25.305501 24.660314 26.595874
193 194
25.628094 26.918468
Call:
lm(formula = mpg ~ horsepower + I(horsepower^2), data = auto_mpg_train_data)
Residuals:
Min 1Q Median 3Q Max
-14.5789 -2.2724 -0.0706 2.2250 12.0560
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 57.8265703 2.3769451 24.328 < 2e-16 ***
horsepower -0.4878086 0.0418201 -11.664 < 2e-16 ***
I(horsepower^2) 0.0013261 0.0001671 7.936 1.6e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.011 on 195 degrees of freedom
Multiple R-squared: 0.7299, Adjusted R-squared: 0.7271
F-statistic: 263.5 on 2 and 195 DF, p-value: < 2.2e-16
1 2 3 4 5 6 7 8
13.44171 15.20435 13.22392 14.49293 15.20435 23.45297 23.45297 22.98658
9 10 11 12 13 14 15 16
25.16887 38.19344 25.42461 19.63740 24.66535 14.24752 13.30957 13.07608
17 18 19 20 21 22 23 24
25.16887 24.66535 22.30689 21.22712 25.16887 13.44171 12.98724 20.21365
25 26 27 28 29 30 31 32
29.57895 22.30689 25.68300 30.48143 33.33208 27.28904 35.35187 25.68300
33 34 35 36 37 38 39 40
14.49293 14.23495 13.73554 14.07622 13.72581 14.49293 25.42461 30.48143
41 42 43 44 45 46 47 48
25.68300 24.17244 22.98658 27.28904 25.16887 13.07243 13.22959 14.49293
49 50 51 52 53 54 55 56
13.85802 14.24752 22.30689 22.30689 25.16887 23.45297 12.98724 23.69014
57 58 59 60 61 62 63 64
24.66535 20.81378 14.97595 15.78223 24.41757 23.45297 22.30689 31.09634
65 66 67 68 69 70 71 72
27.28904 28.70034 20.21365 21.22712 15.52528 14.49293 14.49293 28.70034
73 74 75 76 77 78 79 80
23.45297 29.57895 13.22392 14.49293 14.67818 21.22712 20.21365 23.45297
81 82 83 84 85 86 87 88
20.21365 16.96719 28.70034 26.47408 22.98658 24.66535 23.45297 22.75737
89 90 91 92 93 94 95 96
19.26649 27.01474 24.17244 27.56599 26.47408 15.52528 14.49293 22.30689
97 98 99 100 101 102 103 104
21.22712 27.01474 36.04635 35.69778 22.30689 20.21365 28.70034 14.49293
105 106 107 108 109 110 111 112
14.97595 16.82285 30.78756 14.97595 21.22712 22.75737 12.98724 13.22392
113 114 115 116 117 118 119 120
14.58423 24.91579 26.47408 31.09634 27.84560 22.98658 20.21365 20.21365
121 122 123 124 125 126 127 128
37.46713 31.40777 36.04635 33.33208 20.21365 15.52528 24.66535 25.94404
129 130 131 132 133 134 135 136
20.21365 14.97595 15.64310 15.52528 22.98658 23.45297 25.94404 22.98658
137 138 139 140 141 142 143 144
19.26649 29.87712 25.94404 24.66535 16.82285 15.76357 16.14091 14.07622
145 146 147 148 149 150 151 152
31.72186 28.12786 29.87712 24.66535 30.17795 30.17795 31.72186 24.66535
153 154 155 156 157 158 159 160
19.26649 19.26649 24.66535 28.41278 33.33208 31.72186 24.66535 24.66535
161 162 163 164 165 166 167 168
28.70034 24.17244 31.72186 37.46713 31.09634 31.09634 16.54211 22.30689
169 170 171 172 173 174 175 176
25.16887 26.20774 20.21365 31.09634 32.68004 30.78756 32.35799 31.72186
177 178 179 180 181 182 183 184
31.72186 28.70034 22.30689 28.41278 19.08502 18.38564 25.94404 25.16887
185 186 187 188 189 190 191 192
25.94404 28.99056 30.78756 30.17795 31.09634 24.17244 23.21845 26.20774
193 194
24.66535 26.74308
Call:
lm(formula = mpg ~ horsepower + I(horsepower^2) + I(horsepower^3),
data = auto_mpg_train_data)
Residuals:
Min 1Q Median 3Q Max
-14.5616 -2.1686 -0.0387 2.2816 12.1906
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.169e+01 6.106e+00 10.103 < 2e-16 ***
horsepower -5.936e-01 1.596e-01 -3.719 0.000261 ***
I(horsepower^2) 2.211e-03 1.298e-03 1.703 0.090240 .
I(horsepower^3) -2.272e-06 3.307e-06 -0.687 0.492869
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.017 on 194 degrees of freedom
Multiple R-squared: 0.7306, Adjusted R-squared: 0.7264
F-statistic: 175.3 on 3 and 194 DF, p-value: < 2.2e-16
1 2 3 4 5 6 7 8
13.72427 14.16573 13.50298 14.72080 14.16573 23.29990 23.29990 22.83595
9 10 11 12 13 14 15 16
25.02291 38.84003 25.28183 19.56152 24.51471 13.67405 13.22028 13.13625
17 18 19 20 21 22 23 24
25.02291 24.51471 22.16324 21.10305 25.02291 13.72427 13.21651 20.11761
25 26 27 28 29 30 31 32
29.56159 22.16324 25.54399 30.50895 33.54048 27.18566 35.72305 25.54399
33 34 35 36 37 38 39 40
14.72080 14.48018 13.41775 14.33147 13.99987 14.72080 25.28183 30.50895
41 42 43 44 45 46 47 48
25.54399 24.01930 22.83595 27.18566 25.02291 13.33430 13.18759 14.72080
49 50 51 52 53 54 55 56
14.12569 13.67405 22.16324 22.16324 25.02291 23.29990 13.21651 23.53655
57 58 59 60 61 62 63 64
24.51471 20.69999 15.16954 14.46322 24.26542 23.29990 22.16324 31.15791
65 66 67 68 69 70 71 72
27.18566 28.64519 20.11761 21.10305 15.67940 14.72080 14.72080 28.64519
73 74 75 76 77 78 79 80
23.29990 29.56159 13.50298 14.72080 14.89307 21.10305 20.11761 23.29990
81 82 83 84 85 86 87 88
20.11761 17.02466 28.64519 26.35002 22.83595 24.51471 23.29990 22.60863
89 90 91 92 93 94 95 96
19.20522 26.90380 24.01930 27.47085 26.35002 15.67940 14.72080 22.16324
97 98 99 100 101 102 103 104
21.10305 26.90380 36.47991 36.09963 22.16324 20.11761 28.64519 14.72080
105 106 107 108 109 110 111 112
15.16954 16.88932 30.83168 15.16954 21.10305 22.60863 13.21651 13.50298
113 114 115 116 117 118 119 120
14.80574 24.76721 26.35002 31.15791 27.75938 22.83595 20.11761 20.11761
121 122 123 124 125 126 127 128
38.03829 31.48764 36.47991 33.54048 20.11761 15.67940 24.51471 25.80940
129 130 131 132 133 134 135 136
20.11761 15.16954 15.78886 15.67940 22.83595 23.29990 25.80940 22.83595
137 138 139 140 141 142 143 144
19.20522 29.87392 25.80940 24.51471 16.88932 15.90085 16.25210 14.33147
145 146 147 148 149 150 151 152
31.82089 28.05127 29.87392 24.51471 30.18970 30.18970 31.82089 24.51471
153 154 155 156 157 158 159 160
19.20522 19.20522 24.51471 28.34654 33.54048 31.82089 24.51471 24.51471
161 162 163 164 165 166 167 168
28.64519 24.01930 31.82089 38.03829 31.15791 31.15791 16.62658 22.16324
169 170 171 172 173 174 175 176
25.02291 26.07807 20.11761 31.15791 32.84193 30.83168 32.49803 31.82089
177 178 179 180 181 182 183 184
31.82089 28.64519 22.16324 28.34654 19.03136 18.36418 25.80940 25.02291
185 186 187 188 189 190 191 192
25.80940 28.94724 30.83168 30.18970 31.15791 24.01930 23.06637 26.07807
193 194
24.51471 26.62526
# Fit polynomial models of order 1..6 on the training data and compute
# the test-set MSE for each.
n_orders <- 6L
mpg_model_order <- seq_len(n_orders)
mpg_mse <- numeric(n_orders) # preallocate instead of growing with c()
for (i in mpg_model_order) {
  auto_mpg_lm <- lm(mpg ~ poly(horsepower, i), data = auto_mpg_train_data)
  auto_mpg_test_pred <- predict(auto_mpg_lm, newdata = auto_mpg_test_data)
  mpg_mse[i] <- mean((auto_mpg_test_data$mpg - auto_mpg_test_pred)^2)
}
mpg_model_dry_df <- tibble(
  model_order = mpg_model_order,
  mpg_mse
)
mpg_model_dry_df
# Plot test MSE against polynomial order.  The original chained two
# labs() calls (the second overrode the first) and misspelled
# "Validation" in the displayed title; both are fixed here.
mpg_model_dry_df |>
  ggplot(aes(x = model_order, y = mpg_mse)) +
  geom_point() +
  geom_line() +
  labs(
    title = "Validation Set Approach to Model Validation",
    subtitle = "Data split 50/50",
    x = "Model Order",
    y = "MSE",
    caption = "Source: Auto MPG dataset from UCI Machine Learning Repository"
  ) +
  scale_x_continuous(breaks = 1:6) +
theme_light()
# Validation-set MSE of the linear model mpg ~ horsepower for each seed.
# For every seed: split auto_mpg_9_final 50/50 with caret, fit on the
# training half, and compute the test-half MSE.
# Returns a tibble with one row per seed (MSE rounded to 3 decimals).
mse_seed <- function(seed_values) {
  mse_values <- numeric(length(seed_values)) # preallocated
  for (i in seq_along(seed_values)) {
    set.seed(seed_values[i])
    auto_mpg_train <- caret::createDataPartition(auto_mpg_9_final$mpg, p = 0.5, list = FALSE)
    auto_mpg_train_data <- auto_mpg_9_final[auto_mpg_train, ]
    auto_mpg_test_data <- auto_mpg_9_final[-auto_mpg_train, ]
    auto_mpg_lm <- lm(mpg ~ horsepower, data = auto_mpg_train_data)
    auto_mpg_test_pred <- predict(auto_mpg_lm, newdata = auto_mpg_test_data)
    mse_values[i] <- mean((auto_mpg_test_data$mpg - auto_mpg_test_pred)^2)
  }
  tibble::tibble(Seed = seed_values, MSE = round(mse_values, 3))
}
# Example usage:
seed_values <- c(42, sample(1:9999, 9))
result <- mse_seed(seed_values)
result
# Validation-set MSE for polynomial degrees 1..max_degree, per seed.
# One 50/50 split per seed (so all degrees for a seed share the same
# split), then the test MSE is recorded for each degree.
# Returns a data frame with columns Seed, Degree, MSE.
mse_seed_poly <- function(seed_values, max_degree = 3) {
  # Collect rows in a list and bind once at the end — avoids the
  # O(n^2) rbind-inside-loop growth of the original.
  rows <- list()
  for (seed in seed_values) {
    set.seed(seed)
    auto_mpg_train <- caret::createDataPartition(auto_mpg_9_final$mpg, p = 0.5, list = FALSE)
    auto_mpg_train_data <- auto_mpg_9_final[auto_mpg_train, ]
    auto_mpg_test_data <- auto_mpg_9_final[-auto_mpg_train, ]
    for (degree in seq_len(max_degree)) {
      # fml (not "formula") so the local does not shadow stats::formula
      fml <- as.formula(paste("mpg ~ poly(horsepower, degree =", degree, ")"))
      auto_mpg_lm <- lm(fml, data = auto_mpg_train_data)
      auto_mpg_test_pred <- predict(auto_mpg_lm, newdata = auto_mpg_test_data)
      mse <- mean((auto_mpg_test_data$mpg - auto_mpg_test_pred)^2)
      rows[[length(rows) + 1]] <- data.frame(Seed = seed, Degree = degree, MSE = mse)
    }
  }
  dplyr::bind_rows(rows)
}
# Example usage:
set.seed(42) # Set a seed for reproducibility
seed_values <- c(42, sample(1:9999, 10))
result_df <- mse_seed_poly(seed_values, max_degree = 6)
result_df
# One MSE-vs-degree line per seed; legend suppressed since the seeds
# themselves are not meaningful.  Duplicate labs() removed and the
# "Validation" typo in the title fixed.
result_df |>
  ggplot(aes(x = Degree, y = MSE, color = factor(Seed))) +
  geom_line(show.legend = FALSE) +
  labs(
    title = "Validation Set Approach to Model Validation",
    subtitle = "Data split 50/50; 11 random seeds",
    x = "Model Order",
    y = "MSE",
    caption = "Source: Auto MPG dataset from UCI Machine Learning Repository"
  ) +
  scale_x_continuous(breaks = 1:6) +
theme_light()
# Fit the degree-2 polynomial model and overlay its test-set
# predictions (red line) on the actual test points.
# Duplicate labs() call removed (second call overrode the first).
auto_mpg_lm <- lm(mpg ~ poly(horsepower, 2), data = auto_mpg_train_data)
auto_mpg_test_pred <- predict(auto_mpg_lm, newdata = auto_mpg_test_data)
auto_mpg_test_data |>
  ggplot(aes(x = horsepower, y = mpg)) +
  geom_point() +
  geom_line(aes(y = auto_mpg_test_pred), color = "red") +
  labs(
    title = "Validation Set Approach",
    subtitle = "Data split 50/50; Model Order 2",
    x = "Horsepower",
    y = "MPG",
    caption = "Source: Auto MPG dataset from UCI Machine Learning Repository"
  ) +
theme_light()
# auto_mpg_poisson <- glm(mpg ~ horsepower, data = auto_mpg_train_data, family = "poisson")
# auto_mpg_test_pred_poisson <- predict(auto_mpg_poisson, newdata = auto_mpg_test_data, type = "response")
# Gamma GLM with log link as an alternative to the polynomial fits;
# type = "response" returns predictions on the mpg scale.
auto_mpg_gamma <- glm(mpg ~ horsepower, data = auto_mpg_train_data, family = Gamma(link = "log"))
auto_mpg_test_pred_gamma <- predict(auto_mpg_gamma, newdata = auto_mpg_test_data, type = "response")
# mpg_mse_poisson <- mean((auto_mpg_test_data$mpg - auto_mpg_test_pred_poisson)^2)
# mpg_mse_poisson
mpg_mse_gamma <- mean((auto_mpg_test_data$mpg - auto_mpg_test_pred_gamma)^2)
mpg_mse_gamma
# [1] 23.80739
# Overlay the Gamma GLM predictions (blue line) on the test data.
# Duplicate labs() removed; "Validation" typo in the title fixed.
auto_mpg_test_data |>
  ggplot(aes(x = horsepower, y = mpg)) +
  geom_point() +
  geom_line(aes(y = auto_mpg_test_pred_gamma), color = "blue") +
  labs(
    title = "Validation Set Approach to Model Validation",
    subtitle = "Data split 50/50; Gamma (blue line) model",
    x = "Horsepower",
    y = "MPG",
    caption = "Source: Auto MPG dataset from UCI Machine Learning Repository"
  ) +
  theme_light()
Call:
glm(formula = mpg ~ horsepower, data = auto_mpg_9_final)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 39.935861 0.717499 55.66 <2e-16 ***
horsepower -0.157845 0.006446 -24.49 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 24.06645)
Null deviance: 23819.0 on 391 degrees of freedom
Residual deviance: 9385.9 on 390 degrees of freedom
AIC: 2363.3
Number of Fisher Scoring iterations: 2
Call:
lm(formula = mpg ~ horsepower, data = auto_mpg_9_final)
Residuals:
Min 1Q Median 3Q Max
-13.5710 -3.2592 -0.3435 2.7630 16.9240
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 39.935861 0.717499 55.66 <2e-16 ***
horsepower -0.157845 0.006446 -24.49 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.906 on 390 degrees of freedom
Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
We perform linear regression using the glm() function rather than the lm() function because the former can be used together with cv.glm(). The cv.glm() function is part of the boot library (functions for bootstrapping).
cv.glm() function
[1] 24.23151 24.23114
\[ CV_{(n)} = \frac{1}{n} \sum_{i=1}^{n} \text{MSE}_i \]
LOOCV is a general method and can be used with any kind of predictive modeling.
Advantages
Disadvantages
# Perform LOOCV over a range of polynomial degrees.
# For each degree 1..max_degree, fit glm(mpg ~ poly(horsepower, degree))
# on auto_mpg_9_final and estimate the LOOCV MSE with boot::cv.glm
# (K = n makes K-fold CV leave-one-out; delta[1] is the raw estimate).
# Returns a tibble with columns Degree and MSE.
auto_mpg_cv_glm_poly_loocv <- function(max_degree = 3) {
  # Build one tibble per degree and bind once — avoids growing a
  # tibble with bind_rows() inside the loop.
  rows <- lapply(seq_len(max_degree), function(degree) {
    fml <- as.formula(paste("mpg ~ poly(horsepower, degree =", degree, ")"))
    auto_mpg_glm <- glm(fml, data = auto_mpg_9_final)
    cv_results <- cv.glm(auto_mpg_9_final, auto_mpg_glm, K = nrow(auto_mpg_9_final))
    tibble(Degree = degree, MSE = cv_results$delta[1])
  })
  bind_rows(rows)
}
# Example usage:
result_df <- auto_mpg_cv_glm_poly_loocv(10)
result_df |> gt::gt() | Degree | MSE |
|---|---|
| 1 | 24.23151 |
| 2 | 19.24821 |
| 3 | 19.33498 |
| 4 | 19.42443 |
| 5 | 19.03321 |
| 6 | 18.97864 |
| 7 | 18.83305 |
| 8 | 18.96115 |
| 9 | 19.06863 |
| 10 | 19.49093 |
cv.glm() function
[1] 24.30505 24.28609
\[ CV_{(k)} = \frac{1}{k} \sum_{i=1}^{k} \text{MSE}_i \]
# Perform K-fold cross-validation over a range of polynomial degrees.
# (The original comment said "LOOCV", but this function uses K folds —
# K = 10 by default.)  delta[1] from boot::cv.glm is the raw CV MSE.
# Returns a tibble with columns Degree and MSE.
auto_mpg_cv_glm_poly_kfold <- function(max_degree = 3, K = 10) {
  # One tibble per degree, bound once (no grow-in-loop).
  rows <- lapply(seq_len(max_degree), function(degree) {
    fml <- as.formula(paste("mpg ~ poly(horsepower, degree =", degree, ")"))
    auto_mpg_glm <- glm(fml, data = auto_mpg_9_final)
    cv_results <- cv.glm(auto_mpg_9_final, auto_mpg_glm, K = K)
    tibble(Degree = degree, MSE = cv_results$delta[1])
  })
  bind_rows(rows)
}
# Example usage:
result_df <- auto_mpg_cv_glm_poly_kfold(10, 10)
result_df |> gt::gt() | Degree | MSE |
|---|---|
| 1 | 24.13767 |
| 2 | 19.25900 |
| 3 | 19.36803 |
| 4 | 19.38186 |
| 5 | 19.07045 |
| 6 | 18.89805 |
| 7 | 19.62119 |
| 8 | 18.87379 |
| 9 | 19.00811 |
| 10 | 19.75573 |
user system elapsed
5.211 0.162 5.375
The bias-variance trade-off is a fundamental concept in machine learning that deals with the balance between a model’s ability to fit the training data (low bias) and its ability to generalize to new, unseen data (low variance).
Cross-validation helps us understand and manage this trade-off.
In cross-validation:
Bias:
Variance:
As we adjust model complexity:
The goal is to find the sweet spot where the model complexity balances bias and variance, minimizing overall error.
Cross-validation helps us identify this balance by:
Simulated Data of 100 pairs of returns.
# p1: both simulated return series plotted against observation index;
# X in black, Y in red, with text annotations serving as a legend.
# (Layer order matters: annotations are drawn on top of the lines.)
p1 <- portfolio |>
ggplot() +
geom_line(aes(x = seq_along(X), y = X)) +
geom_line(aes(x = seq_along(Y), y = Y), color = "red") +
labs(
title = "Simulated Portfolio Data",
x = "Sequence",
y = "Change in Returns"
) +
annotate("text", x = 85, y = -2.5, label = "Investment X", color = "black") +
annotate("text", x = 85, y = -2.75, label = "Investment Y", color = "red") +
theme_light()
# p2: scatterplot of Y against X to show the joint distribution of the
# two simulated returns.
p2 <- portfolio |>
ggplot() +
geom_point(aes(x = X, y = Y)) +
labs(
title = "Scatterplot of Simulated Portfolio Data",
x = "X",
y = "Y"
) +
theme_light()
p1 + p2
. . . we wish to invest a fixed sum of money in two financial assets that yield returns of \(X\) and \(Y\), respectively, where \(X\) and \(Y\) are random quantities. We will invest a fraction \(\alpha\) of our money in \(X\) and will invest the remaining \(1-\alpha\) in \(Y\) . . . we wish to choose \(\alpha\) to minimize the total risk, or variance of our investment.
\[ \alpha = \frac{\sigma_Y^2 - \sigma_{XY}}{\sigma_X^2 + \sigma_Y^2 - 2\sigma_{XY}} \]
where \(\sigma_X^2\) is the variance of \(X\), \(\sigma_Y^2\) is the variance of \(Y\), and \(\sigma_{XY}\) is the covariance between \(X\) and \(Y\).
(We don’t know these values, but can estimate them from the data, so in reality, we have \(\hat{\alpha}\).)
# Load required library
library(tidyverse)

# Your original data frame
test_tibble <- tibble(
  x = 1:10,
  y = (1:10)^2
)

# Stack (replicate) a data frame n_repeats times, tagging each copy
# with its repeat number.  seq_len() replaces 1:n_repeats so that
# n_repeats = 0 yields zero copies rather than the c(1, 0) sequence.
stack_dataframe <- function(df, n_repeats) {
  map(seq_len(n_repeats), \(repeat_num) {
    df |>
      mutate(repeat_number = repeat_num)
  }) |>
    list_rbind()
}

# Create the new stacked data frame (e.g., repeating 5 times)
stacked_df <- stack_dataframe(test_tibble, 5)

# View the result
print(stacked_df)
# A tibble: 50 × 3
x y repeat_number
<int> <dbl> <int>
1 1 1 1
2 2 4 1
3 3 9 1
4 4 16 1
5 5 25 1
6 6 36 1
7 7 49 1
8 8 64 1
9 9 81 1
10 10 100 1
# ℹ 40 more rows
Applied Statistical Techniques